% !TEX program = xelatex
%%%%%%%%%%%%%%%%%%%%%%%%

\input{preamble.tex}
\leoslide

\subtitle{Multi-Agent Reinforcement Learning in Games: First Steps and Challenges}

\begin{document}
\maketitle

\introslide

\begin{frame}{\outline}

\blist
    \item Learning framework for MARL
    \item Independent learning
    \item Central learning
    \item Modes of learning
    \item Challenges of MARL
\elist
\end{frame}

\section{MARL Learning Framework}

\begin{frame}[t]{MARL Learning Process}

\begin{figure}
    \centering
    \includegraphics[width=0.8\textwidth, height = 0.8\textheight, keepaspectratio]{images/chapter_5/learning-process.pdf}
\end{figure}

\blist
    \item The game model defines the learning environment
    \item Interaction data from joint policy $\pol^z$ are collected as $\mathcal{D}^{z} = \{ h^{t_e} \mid e = 1, \ldots, z\}, z \geq 0$
    \item A learning algorithm updates the joint policy as $\pi^{z+1} = \mathbb{L}(\mathcal{D}^{z}, \pi^{z})$
    \item The learning goal is a chosen solution concept, e.g. Nash equilibrium
\elist
    
\end{frame}

% \begin{frame}{MARL Learning Process - Continued}

% There are several additional nuances to the MARL learning process.

% \blist
%     \item The chosen game model will determine what the learnt joint policy conditions on:
%     \vspace{-15pt}
%     \begin{itemize}
%         \item Normal-form games: simple probability distribution across actions
%         \item Repeated normal-form games: condition on action histories $h^t = (a^0, ..., a^{t–1})$
%         \item Stochastic games: condition on state-action histories $h^t = (s^0, a^0, s^1, a^1, ..., s^t)$
%         \item POSG: condition on independent observation histories $h^t_i = (o_i^0, ..., o^t_i)$
%     \end{itemize}
%     \item Policies can however be adjusted as required e.g. in a stochastic game only condition on the last state-action 
%     \item The learning algorithm $\mathbb{L}$ can consists of multiple learning algorithms e.g. $\mathbb{L}_i \forall i \in I$
%     \item Each $\mathbb{L}_i$ may use different parts of the data, or use independently collected data
% \elist

% \end{frame}

\begin{frame}[t]{Inputs of Policies Depend on Game Model}

% The chosen game model will determine what the learnt joint policy conditions on:

    \begin{minipage}[t]{0.39\linewidth}
        % First Column
        \centering
        \vspace{0.5cm}
        \visible<1->{
            \includegraphics[width=0.8\linewidth, keepaspectratio]{images/chapter_4/nf_inputs-1.pdf}
            \textbf{Normal-form games}
        }
        \vspace{1.5cm}
        \visible<2->{
            \includegraphics[width=\linewidth]{images/chapter_4/rnf_input-1.pdf} 
            \textbf{Repeated normal-form \\ games}
        }
    \end{minipage}
    \hfill
    \begin{minipage}[t]{0.59\linewidth}
        \centering
        \visible<3->{
            \includegraphics[width=0.75\linewidth]{images/chapter_4/sg_input-1.pdf}\\
            \textbf{Stochastic games}
        }
        \vspace{0.5cm}
        \visible<4->{
            \includegraphics[width=0.75\linewidth]{images/chapter_4/posg_input-1.pdf}\\
            \textbf{Partially observable stochastic games}
        }
    \end{minipage}

\end{frame}

\begin{frame}{Convergence}

To evaluate the learning algorithm, we typically assess whether the learnt joint policy has \fat{converged} to an optimal joint policy:
\vspace{0pt}
\[
\lim_{z \to \infty}\pi^z = \pi^*
\]

\blist
    \item Optimal joint policies may differ depending on the solution concept
    \item There may be many valid solutions $\Rightarrow$ e.g. multiple Nash equilibria
    \item In practice, we cannot collect infinite data $z \to \infty$
    \listtab Learning typically stops after a predefined 'budget' (e.g. training steps)
    % \item Alternative theoretical evaluation criteria include:
    % \begin{itemize}
    %     \item Convergence of average return
    %     \item Convergence of empirical distribution of the learn policy to the optimal policy
    % \end{itemize}
\elist
    
\end{frame}


\begin{frame}[t]{Single-Agent RL Reductions}

The simplest way to apply RL algorithms in multi-agent settings is to reduce them to single-agent problems. 

\vspace{10pt}

\visible<2->{
\e{\bf Central learning:}
\blist
    \item Apply one single-agent RL algorithm to control all agents centrally 
    \listtab A central policy is learned over the joint action space
\elist}

\visible<3->{
\e{\bf Independent learning:}
\blist
    \item Apply single-agent RL algorithms to each agent independently
	\listtab Agents do not explicitly consider or represent each other
\elist}
    
\end{frame}

\section{Central Learning}

\begin{frame}{Central Learning}

\e{Central learning:} learn a single central policy $\pi_c$ which receives observations of all agents and selects an action for each agent (i.e. joint action $(\ac_1,...,\ac_n)$). 

\blist
    \item<1-> Requires transforming the joint reward $(r_1, \ldots, r_n)$ into a single scalar reward $r$
    \item<2-> This can be easy in some settings, e.g. in games with common rewards $r = r_i$, but difficult in zero-sum or general-sum games
    \item<3-> May not scale well with the number of agents, as the joint action space may grow exponentially with the number of agents
    \item<4-> May also not be suitable in environments that require agents to act independently based on local observations
\elist
\end{frame}

\begin{frame}{Central Q-Learning}


\balg{Central Q-learning}{cql}
\State Initialize: \( Q(s, a) = 0 \) for all \( s \in S \) and \( a \in A = A_1 \times \ldots \times A_n \)
\State Repeat for every episode:
\For{\( t = 0, 1, 2, \ldots \)}
    \State Observe current state \( s^t \)
    \State With probability \( \epsilon \): choose random joint action \( a^t \in A \)
    \State Otherwise: choose joint action \( a^t \in \arg\max_a Q(s^t, a) \)
    \State Apply joint action \( a^t \), observe rewards \( r_1^t, \ldots, r_n^t \) and next state \( s^{t+1} \)
    \State Transform \( r_1^t, \ldots, r_n^t \) into scalar reward \( r^t \)
    \State \( Q(s^t, a^t) \leftarrow Q(s^t, a^t) + \alpha [ r^t + \gamma \max_{a'} Q(s^{t+1}, a') - Q(s^t, a^t) ] \)
\EndFor
\ealg

\end{frame}

\section{Independent Learning}

\begin{frame}{Independent Learning}

\e{Independent learning:} each agent $i$ learns its policy $\pi_i$ using only its local history of observations.

\blist
    \item From the perspective of each agent $i$, the environment transition function looks like this:
\elist
\[
\mathcal{T}_i(s^{t+1} | s^t, a_i) \propto \sum_{a_{-i} \in A_{-i}} \mathcal{T}(s^{t+1} | s^t, \langle a_i, a_{-i} \rangle) \prod_{j \neq i} \pi_j(a_j | s^t)
\]
% \vspace{5pt}
\blist
    \item As each agent $j$'s policies are updated, the action probabilities $\pi_j$ change
    \listtab From agent $i$'s perspective, the transition function $\mathcal{T}_i$ appears non-stationary!
\elist
    
\end{frame}

\begin{frame}{Independent Q-learning}

\balg{Independent Q-learning (IQL) for stochastic games}{iql}
\State // \textit{Algorithm controls agent} \( i \)
\State Initialize: \( Q_i(s, a_i) = 0 \) for all \( s \in S \), \( a_i \in A_i \)
\State Repeat for every episode:
\For{\( t = 0, 1, 2, \ldots \)}
    \State Observe current state \( s^t \)
    \State With probability \( \epsilon \): choose random action \( a_i^t \in A_i \)
    \State Otherwise: choose action \( a_i^t \in \arg\max_{a_i} Q_i(s^t, a_i) \)
    \State (meanwhile, other agents \( j \neq i \) choose their actions \( a_j^t \))
    \State Observe own reward \( r_i^t \) and next state \( s^{t+1} \)
    \State \( Q_i(s^t, a_i^t) \leftarrow Q_i(s^t, a_i^t) + \alpha [ r_i^t + \gamma \max_{a'_i} Q_i(s^{t+1}, a'_i) - Q_i(s^t, a_i^t) ] \)
\EndFor
\ealg
\end{frame}

\begin{frame}{IQL and CQL in Level-Based Foraging}

\begin{columns}
    \begin{column}{0.5\textwidth}
    \begin{figure}
        \centering
        \includegraphics[width=\textwidth,height=.8\textheight,keepaspectratio]{images/environments/lbf/tabular_marl_lbf_annot.pdf}
        \label{fig:enter-label}
        
    \end{figure}
        
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \vspace{10pt}
        \centering
        \includegraphics[width=\textwidth,height=.8\textheight,keepaspectratio]{images/chapter_5/tabular_marl_lbf_returns_iql_cql.pdf}
        \label{fig:enter-label}
        \blist
            \item IQL can learn more quickly, as CQL needs to explore $6^2 = 36$ actions in each state
        \elist
        
    \end{column}
\end{columns}
    
\end{frame}

\begin{frame}{Modes of Operation in MARL}

Modes of operation in MARL:

\vspace{5pt}

\fat{Self-play:}
\blist
    \item {\it Algorithm self-play:} all agents use the same learning algorithm (and parameters)
    \item {\it Policy self-play:} agent's policy is trained directly against itself
\elist

\vspace{5pt}

\fat{Mixed-play:}
\blist
    \item Agents use different learning algorithms
    % \item Trading bots in markets that use RL are one example of a mixed-play scenario
    % \item Ad-hoc teamwork is another area of research that focuses on this setting where agents must collaborate with previously unknown agents
\elist
    
\end{frame}

\section{MARL Challenges}

\begin{frame}{MARL Challenges}

\small 

\bcol
	\tcol{0.47}
		\begin{redtitlebox}{Singe-Agent RL Challenges}
		\blist
		    \item Unknown environment dynamics
		    \item Exploration-exploitation dilemma
		    \item Non-stationarity from bootstrapping
		    \item Temporal credit assignment
		\elist
		\end{redtitlebox}
		
	\tcol{0.5}
		\visible<2->{
		\begin{bluetitlebox}{Multi-Agent RL Challenges}
		\blist
		    \item Non-stationarity from multiple learning agents
   		    \item Equilibrium selection
		    \item Multi-agent credit assignment
		    \item Scaling to many agents
		\elist
		\end{bluetitlebox}
		}
\ecol    
\end{frame}

\begin{frame}{Non-Stationarity}

\underline{A stochastic process ${X^t}_{t \in \mathbb{N}^0}$ is stationary if:}

\begin{itemize}
    \item The probability distribution of $X^{t + \tau}$ does not depend on $\tau \in \mathbb{N}^0$, where t and $t + \tau$ are time indices
    \item This means that the dynamics of the process do not change over time
\end{itemize}

\pause

\underline{Consider: $X^t$ samples the state $s^t$ at each time step $t$:}

\begin{itemize}
    \item<2-> In a MDP, $X^t$ is completely defined by the state transition function $\mathcal{T}(s^t|s^{t-1}, a^{t-1})$ and the agent's policy $\pi$ which selects an action $a \sim \pi(.|s)$
    \item<3-> If $\pi$ does not change, then this process is stationary (i.e. independent of $t$) because $s^t$ depends only on $s^{t-1}$, $a^{t-1}$ and $a^{t-1}$ depends only on  $s^{t-1}$ via $\pi(.|s^{t-1})$
    \item<4-> However, in RL the policy does change over time through the learning $\pi^{z+1} = \mathbb{L}(\mathcal{D}^z, \pi^z)$,  which leads to \fat{non-stationarity} in $X^t$
\end{itemize}

\end{frame}

\begin{frame}[t]{Non-Stationarity in Multi-Agent Settings}

In MARL, non-stationarity is exacerbated by multiple agents changing their policies!

\vspace{15pt}

\bcol
	\col{0.45}
        \includegraphics[width=0.95\textwidth]{images/chapter_5/wolfphc_rps_own.pdf}

	\col{0.45}
		\blist
		    \item $\pi^{z+1} = \mathbb{L}(\mathcal{D}^z, \pi^z)$ updates an {\it entire} joint policy $\pi^z = (\pi_1^z, ..., \pi_n^z)$
		    \item Environment is non-stationary from each agent's perspective
		    \item Can cause cyclic learning dynamics where agents co-adapt to each other's changing policies 
		    % \item Stochastic approximation conditions that ensure convergence in single-agent RL don't apply in MARL
		\elist

\ecol
\end{frame}

\begin{frame}{Equilibrium Selection}

\e{Equilibrium selection:} a game may have multiple equilibria, which can yield different expected returns to the agents.
\vspace{5pt}

\bcol
    \col{0.5}
        \blist
            \item<1-> {\bf Example:} Stag Hunt matrix game
            \item<1-> Two hunters choose: cooperate to hunt stag (S) or go solo for hare (H)
            \item<2-> Nash equilibria: reward-dominant (S,S) maximizes reward, risk-dominant (H,H) minimizes risk
            \item<3-> (S,S) requires trust; (H,H) offers a safe, lower reward
        \elist

    \col{0.5}
        \centering
        \gamestaghunt
		\vspace{10pt}
		\visible<4->{
        \blist
            \item Indep. Q-learning may bias towards (H,H) due to initial action uncertainty
            \item Early random actions can reinforce (H,H) since deviating from H is penalized if the other agent chooses H
        \elist}

\ecol
    
\end{frame}

\begin{frame}{Multi-Agent Credit Assignment}

%Credit assignment in single-agent RL refers to determining which past action contributes to receiving rewards.
\e{Multi-agent credit assignment:} which agent's actions contributed to received rewards? \\[15pt]

\bcol
    \col{0.5}
            \centering
            \includegraphics[width=0.85\textwidth]{images/environments/lbf/foraging_8x12_b.png}

    \col{0.5}
        \blist
            \item At time step $t$ all agents perform \textit{collect}, each receiving reward $+1$
            \item Whose actions led to the reward?
            \item The agent on the left did not contribute
            \item For a learning agent that only observes $s^t, a^t, r^t, s^{t+1}$, disentangling the action contributions is difficult
        \elist

\ecol
    
\end{frame}


\begin{frame}[t]{Joint Actions for Addressing Multi-Agent Credit Assignment}

\fat{Joint actions} can help disentangle agent contributions. Consider the RPS game:

\vspace{5pt}

\centering
\gamerps 

\vspace{5pt}

\begin{enumerate}
    \item<2-> Agents choose actions $(a_1, a_2) = (R, S)$ $\quad\Rightarrow$ agent $1$ receives reward $+1$
    \item<3-> Then agents choose $(a_1, a_2) = (R, P)$ $\quad\Rightarrow$ agent $1$ receives reward $-1$
    \item<4-> Action value $Q(a_1)$ does not model agent $2$, value for action $R$ appears to be 0
    \item<5-> \e{Joint action value model} $Q_1(a_1, a_2)$ can represent the effect of agent $2$: $Q_1(R, S)=+1$ and $Q_1(R, P)=-1$
\end{enumerate}

\end{frame}

\begin{frame}{Scaling to Many Agents}

    \begin{problembox}
	    \blist
	        \item \textbf{Joint action space} can grow \textbf{exponentially} with number of agents:
	        	$$|A| = |A_1| \cdot ... \cdot |A_n|$$
	        \item If agents have associated features in $s$ (e.g. agent position) then $|S|$ also increases exponentially
	        %\item In CL, this increases the decision problem, while in IL this increases issues of non-stationarity and multi-agent credit assignment
	        \listtab How to scale efficiently to many agents?
	    \elist
    \end{problembox}

\end{frame}

\begin{frame}{Scaling to Many Agents}
	    
	\bcol
	    \col{0.45}
	        \includegraphics[width=0.9\textwidth]{images/environments/lbf/foraging_8x12_b.png}
	        
	    \col{0.45}
	        %\blist
	        	Changing number of agents from 3 to 5 increases the number of joint actions from 216 to 7776!
	        %\elist
	\ecol

\end{frame}

\begin{frame}{Summary}

\fat{We covered:}
    \blist
        \item MARL learning process
        \item Independent and central learning
        \item Modes of operation in MARL
        \item Challenges of MARL
    \elist

\fat{Next we'll cover:}

    \blist
        \item Foundational algorithms in MARL
    \elist
    
\end{frame}

\end{document}

